library(mixOmics)
library(metagenomeSeq)
library(reshape2)
library(ggplot2)

Data with sick samples preparation

df_abundances <- read.delim("../data/abondances.csv", sep = ",", 
                            stringsAsFactors = FALSE)
df_abundances[df_abundances[ ,1] == "&",1] <- "Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Mannheimia;Mannheimia haemolytica"
abundances <- df_abundances[ ,grep("A_", colnames(df_abundances))] +
  df_abundances[ ,grep("B_", colnames(df_abundances))]
dim(abundances)
## [1] 406  45
condition <- rep("LBA", ncol(abundances))
condition[grep("EN", colnames(abundances))] <- "EN"
condition <- factor(condition)
table(condition)
## condition
##  EN LBA 
##  22  23
id_abundances <- as.character(colnames(abundances))
id_abundances <- sapply(id_abundances, function(ac) 
  substr(ac, nchar(ac) - 19, nchar(ac) - 18))
id_abundances <- gsub("A", "0", id_abundances)
id_abundances <- gsub("N", "0", id_abundances)
table(id_abundances)
## id_abundances
## 01 03 04 06 07 09 10 11 15 16 17 18 19 20 21 23 24 25 26 28 29 30 31 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  1  2  2
species <- sapply(df_abundances[ ,1], function(aname) 
  unlist(strsplit(aname, ";")), simplify = FALSE)
species <- sapply(species, function(avect) {
  find_unknown <- grep("unknown", avect)
  if (length(find_unknown) > 0) {
    return(avect[-find_unknown])
  } else return(avect)
})
species <- unlist(sapply(species, function(avect) avect[length(avect)]))
species <- unname(species)

abundances <- apply(abundances, 2, function(acol) tapply(acol, species, sum))
species <- rownames(abundances)
colnames(abundances) <- paste(condition,id_abundances,sep='_')

Pathogene data preparation

df_pathogenes <- read.delim("../data/pathogenes.csv", sep = ",")
dim(df_pathogenes)
## [1] 46  9
pathogenes <- df_pathogenes[ ,-c(1,9)]
pathogenes <- as.data.frame(ifelse(pathogenes == "p", 1, 0))
pathogenes <- as.data.frame(apply(t(pathogenes), 1, as.factor))
id_pathogenes <- df_pathogenes[ ,1]
id_pathogenes <- gsub("-", "0", id_pathogenes)
id_pathogenes <- gsub(" ", "", id_pathogenes)
id_pathogenes <- sapply(as.character(id_pathogenes), function(aname) {
  substr(aname, nchar(aname)-1, nchar(aname))
})
df_pathogenes[ ,9] <- ifelse(df_pathogenes[ ,9] == "ENP", "EN", "LBA")
id_pathogenes <- paste0(id_pathogenes, "_", df_pathogenes[ ,9])

The list of farm names is not in the same order than in the abundance file. Both ordering are matched according to the abundance order (one sample is thus excluded from the analysis because the abundance data are not available):

pathogenes <- pathogenes[match(paste0(id_abundances, "_", condition),
                               id_pathogenes), ] 
rownames(pathogenes) <- paste0(id_abundances, "_", condition)

Preprocessing of control samples

df_abundances_ctrl <- read.delim("../data/abondances-ctrl.csv", sep = ",", 
                            stringsAsFactors = FALSE)
summary(df_abundances_ctrl[ ,1:15])
##  X.blast_taxonomy   blast_subject      blast_perc_identity
##  Length:293         Length:293         Min.   : 98.08     
##  Class :character   Class :character   1st Qu.: 99.58     
##  Mode  :character   Mode  :character   Median :100.00     
##                                        Mean   : 99.72     
##                                        3rd Qu.:100.00     
##                                        Max.   :100.00     
##  blast_perc_query_coverage  blast_evalue blast_aln_length
##  Min.   : 96.66            Min.   :0     Min.   :430.0   
##  1st Qu.:100.00            1st Qu.:0     1st Qu.:466.0   
##  Median :100.00            Median :0     Median :479.0   
##  Mean   : 99.97            Mean   :0     Mean   :475.4   
##  3rd Qu.:100.00            3rd Qu.:0     3rd Qu.:490.0   
##  Max.   :100.00            Max.   :0     Max.   :512.0   
##    seed_id          observation_name   observation_sum
##  Length:293         Length:293         Min.   :    0  
##  Class :character   Class :character   1st Qu.:   57  
##  Mode  :character   Mode  :character   Median :  137  
##                                        Mean   : 1014  
##                                        3rd Qu.:  364  
##                                        Max.   :97278  
##  NG.13669_IDV1EN_lib206978_5685_2 NG.13669_IDV1LBA_lib206979_5685_2
##  Min.   :    0.00                 Min.   :    0.00                 
##  1st Qu.:    0.00                 1st Qu.:    0.00                 
##  Median :    2.00                 Median :    0.00                 
##  Mean   :   84.53                 Mean   :   84.53                 
##  3rd Qu.:   18.00                 3rd Qu.:    0.00                 
##  Max.   :11964.00                 Max.   :14102.00                 
##  NG.13669_IDV2EN_lib206980_5685_2 NG.13669_IDV2LBA_lib206981_5685_2
##  Min.   :   0.00                  Min.   :    0.00                 
##  1st Qu.:   0.00                  1st Qu.:    0.00                 
##  Median :   4.00                  Median :    0.00                 
##  Mean   :  84.53                  Mean   :   84.53                 
##  3rd Qu.:  40.00                  3rd Qu.:    0.00                 
##  Max.   :2946.00                  Max.   :10748.00                 
##  NG.13669_S2EN_lib206984_5685_2 NG.13669_S2LBA_lib206985_5685_2
##  Min.   :   0.00                Min.   :   0.00                
##  1st Qu.:   0.00                1st Qu.:   0.00                
##  Median :   2.00                Median :   0.00                
##  Mean   :  84.53                Mean   :  84.53                
##  3rd Qu.:  15.00                3rd Qu.:   0.00                
##  Max.   :8065.00                Max.   :6340.00
which_selected <- grep("LBA_|EN_", colnames(df_abundances_ctrl))
abundances_ctrl <- df_abundances_ctrl[ ,which_selected]
dim(abundances_ctrl)
## [1] 293  12
condition_ctrl <- rep("LBA", ncol(abundances_ctrl))
condition_ctrl[grep("EN", colnames(abundances_ctrl))] <- "EN"
condition_ctrl <- factor(condition_ctrl)
table(condition_ctrl)
## condition_ctrl
##  EN LBA 
##   6   6
id_abundances_ctrl=rep(1:6, each = 2)
colnames(abundances_ctrl)=paste(condition_ctrl, id_abundances_ctrl, sep = "_")

Unique species are extracted similarly as for case samples:

species_ctrl <- as.data.frame(sapply(df_abundances_ctrl[ ,1], function(aname) 
  unlist(strsplit(aname, ";"))))
species_ctrl <- sapply(species_ctrl, function(avect) {
  find_unknown <- grep("unknown", avect)
  if (length(find_unknown) > 0) {
    return(avect[-find_unknown])
  } else return(avect)
})
species_ctrl <- unlist(sapply(species_ctrl, function(avect) avect[length(avect)]))
species_ctrl <- unname(species_ctrl)
species_ctrl <- sapply(species_ctrl, as.character)
abundances_ctrl <- apply(abundances_ctrl, 2, function(acol) tapply(acol, species_ctrl, sum))
species_ctrl <- rownames(abundances_ctrl)

Basic exploratory analyses

Exploratory analysis: comparison between case and control samples

In this first part of the exploratory analyses, a comparison of case and control samples is performed in terms of total count and variety of species.

df <- data.frame("total" = c(apply(abundances, 2, sum), 
                             apply(abundances_ctrl, 2, sum)),
                 "condition" = c(condition, condition_ctrl),
                 "type" = rep(c("case", "control"),
                              c(ncol(abundances), ncol(abundances_ctrl))),
                 "sample" = c(paste("case", condition, id_abundances, sep = "_"),
                              paste("ctrl", colnames(abundances_ctrl), sep = "_")))
p <- ggplot(df, aes(x = sample, weight = total, fill = type)) + geom_bar() +
  theme_bw() + ylab("total count") + xlab("samples") +
  theme(axis.text.x = element_blank())
p

Control samples seem to have been badly normalized with two samples with a much larger total count. Moreover, the total count between case and control samples is very different, with a much smaller total count for control samples.

plot.new()
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
vd <- venn.diagram(list("case" = species, "control" = species_ctrl), 
                   fill = brewer.pal(3, "Set2")[1:2], 
                   cat.col = "black", cat.cex = 1.8,cex = 1.7,cat.dist=-0.05, 
                   fontface = "bold", filename = NULL)
grid.draw(vd)

## null device 
##           1
## [1] TRUE

Case and control samples only have 117 bacteria in common. Extracting count data with only this common set of bacteria lead to the following comparison in terms of total count:

common_species <- intersect(species, species_ctrl)
write.table(common_species, file = "../data/common_species.txt", 
            row.names = FALSE, col.names = FALSE)
common_case <- abundances[common_species, ]
common_ctrl <- abundances_ctrl[common_species, ]
df <- data.frame("total" = c(apply(common_case, 2, sum), 
                             apply(common_ctrl, 2, sum)),
                 "condition" = c(condition, condition_ctrl),
                 "type" = rep(c("case", "control"),
                              c(ncol(abundances), ncol(abundances_ctrl))),
                 "sample" = c(paste("case", condition, id_abundances, sep = "_"),
                              paste("ctrl", colnames(abundances_ctrl), sep = "_")))
p <- ggplot(df, aes(x = sample, weight = total, fill = type)) + geom_bar() +
  theme_bw(base_size = 15) + ylab("total count") + xlab("samples") +
  theme(axis.text.x = element_blank())
p

that shows an even larger differences between total counts when only common species are selected.

Differences between EN and LBA samples

The first part of the analysis aims at finding signatures that are specific to EN / LBA samples. This is performed using two types of preprocessing and paired or unpaired analyses. The used method is sparse PLS-DA.

With log-transformed counts (paired analysis with ‘multilevel’, scaled analysis)

A first PLS-DA is computed (with 10-fold CV) to check the efficiency of the method and which type of distance to use in its computation.

set.seed(11)
res_plsda <- plsda(log(t(abundances)+1), condition, ncomp = nlevels(condition))
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 10,
                  progressBar = FALSE, nrepeat = 100)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'PLS-DA Comp 1 - 2',          
          size.title = rel(2),
          size.ylabel = rel(1.5),
          size.xlabel = rel(1.5),
          size.legend.title = rel(1.5))

PLS-DA shows a good separation between the two groups and indicates that the Mahalanobis distance provides the lower overall classification error.

Then, sparse PLS-DA is used (with the multilevel approach) to check which number of components to select.

clean_log <- data.frame(log(t(abundances[ ,id_abundances != "29"]) + 1))
names(clean_log) <- species
clean_condition <- factor(condition[id_abundances != "29"])
clean_id <- factor(id_abundances[id_abundances != "29"])

set.seed(33)
res_plsda <- tune.splsda(clean_log, clean_condition, 
                         ncomp = nlevels(clean_condition),
                         multilevel = clean_id,
                         test.keepX = 1:20, validation = 'Mfold', folds = 10, 
                         dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE)
## Splitting the variation for 1 level factor.
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##     5    14
res_splsda <- splsda(clean_log, clean_condition, 
                     ncomp = nlevels(clean_condition), multilevel = clean_id,
                     keepX = sel_keepX)
## Splitting the variation for 1 level factor.
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA Comp 1 - 2',
          size.title = rel(2),
          size.ylabel = rel(1.5),
          size.xlabel = rel(1.5),
          size.legend.title = rel(1.5))

head(selectVar(res_splsda, comp = 1)$value)
##                                 value.var
## Mycoplasma bovoculi M165/69    -0.6975800
## Corynebacterium camporealensis -0.5978130
## Moraxella                      -0.3111287
## Aquamicrobium                  -0.1832142
## Peptoclostridium               -0.1601039
par(mfrow=c(1,2))
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
             size.title = 1.3, size.xlabel = rel(1.5),legend = F)
plotLoadings(res_splsda, comp = 2, method = 'mean', contrib = 'max',
             size.title = 1.3,legend = F)

par(mfrow=c(1,1))

Tests on the extracted bacteria

In this section two complementary analyses are performed: a Student test to assess whether the found bacteria are differentially abundant between the two conditions and boxplots to visualize the abundance differences.

selected <- selectVar(res_splsda, comp = 1)$name
p<-list()
for (ind in seq_along(selected)) {
  df <- data.frame(counts = abundances[selected[ind], ], 
                   condition = condition)
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 13) + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2","darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  #print(p[ind])
}
multiplot(plotlist =p,cols=2)

all_pvals <- sapply(seq_along(selected), function(ind) {
  var_en <- clean_log[clean_condition == "EN",selected[ind]]
  var_lba <- clean_log[clean_condition == "LBA",selected[ind]]
  pval <- t.test(var_en, var_lba, paired = TRUE)$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
                  FDR = p.adjust(all_pvals, method = "BH"))
res
##                         bacteria       pvalue          FDR
## 1    Mycoplasma bovoculi M165/69 3.728266e-12 1.553412e-11
## 2 Corynebacterium camporealensis 6.213648e-12 1.553412e-11
## 3                      Moraxella 2.367632e-11 3.946053e-11
## 4                  Aquamicrobium 4.077402e-11 4.484280e-11
## 5               Peptoclostridium 4.484280e-11 4.484280e-11

All tests are found positive: those bacteria are found differentially present in EN/LBA samples.

Moreover, among the selected species, those that are also found in the control samples are:

also_in_common <- intersect(selected, common_species)
also_in_common
## [1] "Corynebacterium camporealensis" "Aquamicrobium"                 
## [3] "Peptoclostridium"

For those species the comparison of the total count and relative abundance are provided in the following boxplots:

#p <- list()
for (ind in seq_along(also_in_common)) {
  df <- data.frame(counts = c(abundances[also_in_common[ind], ],
                              abundances_ctrl[also_in_common[ind], ]), 
                   condition = factor(c(as.character(condition),
                                        as.character(condition_ctrl))),
                   type = rep(c("case", "control"), 
                              c(ncol(abundances), ncol(abundances_ctrl))))
  p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)")) +
    facet_grid( ~ type)
  print(p)
}

#multiplot(plotlist =p,cols=3)

That shows that, exept for Micoplasma bovoculi, the differences between LBA and EN (in terms of counts) is not related to the status (case/control) of the sample.

case_counts <- apply(abundances, 2, sum)
ctrl_counts <- apply(abundances_ctrl, 2, sum)
for (ind in seq_along(also_in_common)) {
  freq_case <- abundances[also_in_common[ind], ] / case_counts
  freq_ctrl <- abundances_ctrl[also_in_common[ind], ] / ctrl_counts
  df <- data.frame(counts = c(freq_case, freq_ctrl), 
                   condition = factor(c(as.character(condition),
                                        as.character(condition_ctrl))),
                   type = rep(c("case", "control"), 
                              c(ncol(abundances), ncol(abundances_ctrl))))
  p <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + 
    scale_fill_manual(values = c("dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab("relative abundance") +
    facet_grid( ~ type)
  print(p)
}

that confirms the previous conclusion.

With log-transformed counts (paired analysis with ‘multilevel’, unscaled analysis)

The same analysis is performed without scaling the data.

set.seed(11)
res_plsda <- plsda(log(t(abundances)+1), condition, ncomp = nlevels(condition),
                   scale = FALSE)
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,   # erreur si folds=10
                  progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'PLS-DA Comp 1 - 2')

PLS-DA shows a good separation between the two groups and indicates that the Mahalanobis distance provides the lower overall classification error.

Then, sparse PLS-DA is used (with the multilevel approach) to check which number of components to select.

clean_log <- data.frame(log(t(abundances[ ,id_abundances != "29"]) + 1))
names(clean_log) <- species
clean_condition <- factor(condition[id_abundances != "29"])
clean_id <- factor(id_abundances[id_abundances != "29"])

set.seed(33)
res_plsda <- tune.splsda(clean_log, clean_condition, 
                         ncomp = nlevels(clean_condition),
                         multilevel = clean_id,
                         test.keepX = 1:20, validation = 'Mfold', folds = 10, 
                         dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE, scale = FALSE)
## Splitting the variation for 1 level factor.
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##     4     1

Finally sparse PLS-DA is performed and the variables explaining the two types of samples are obtained:

res_splsda <- splsda(clean_log, clean_condition, 
                     ncomp = nlevels(clean_condition), multilevel = clean_id,
                     keepX = sel_keepX, scale = FALSE)
## Splitting the variation for 1 level factor.
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA Comp 1 - 2')

head(selectVar(res_splsda, comp = 1)$value)
##                               value.var
## Streptococcus pluranimalium -0.61298859
## Mycoplasma bovoculi M165/69 -0.56123953
## Moraxella                   -0.54756384
## Mannheimia sp.              -0.09710312
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
             size.title = 1)

Tests on the extracted bacteria

In this section two complementary analyses are performed: a Student test to assess whether the found bacteria are differentially abundant between the two conditions and boxplots to visualize the abundance differences.

selected <- selectVar(res_splsda, comp = 1)$name
for (ind in seq_along(selected)) {
  df <- data.frame(counts = abundances[selected[ind], ], 
                   condition = condition)
  p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw() + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2","darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  print(p)
}

all_pvals <- sapply(seq_along(selected), function(ind) {
  var_en <- clean_log[clean_condition == "EN",selected[ind]]
  var_lba <- clean_log[clean_condition == "LBA",selected[ind]]
  pval <- t.test(var_en, var_lba, paired = TRUE)$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
                  FDR = p.adjust(all_pvals, method = "BH"))
res
##                      bacteria       pvalue          FDR
## 1 Streptococcus pluranimalium 3.738224e-08 3.738224e-08
## 2 Mycoplasma bovoculi M165/69 3.728266e-12 1.491306e-11
## 3                   Moraxella 2.367632e-11 4.735264e-11
## 4              Mannheimia sp. 4.288368e-09 5.717824e-09

Predicting the presence of viruses

This section tries to make a link between abundance dataset and virus presence. It is divided into two parts: the first one performs a multivariate analysis relating the presence or absence of any virus to the abundance data. The second one aims at predicting a specific group of three viruses from abundance data.

Definition of a group of virus

In the sequel, the presence/absence of viruses is predicted from abundance data. Since most virus presence is very rare (with highly unbalanced datasets), a group of three viruses RSV, PI.3, Coronavirus is defined and used as a target for prediction.

The factor corresponding to these three virus presence is thus defined:

group_virus <- apply(sapply(pathogenes, as.numeric) - 1, 2, as.logical)
group_virus <- (group_virus[ ,1]) | (group_virus[ ,2]) | (group_virus[ ,3])
group_virus <- as.factor(as.numeric(group_virus))
table(group_virus)
## group_virus
##  0  1 
## 18 27

PLS-DA in EN condition (log-transformed counts)

sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
                   Y = group_virus[sel_samples], 
                   ncomp = nlevels(group_virus))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
                 progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',
          size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
          size.legend = rel(1))

set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
                         Y = group_virus[sel_samples], 
                         ncomp = nlevels(group_virus),
                         test.keepX = 1:20, validation = 'Mfold', 
                         folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE)
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##     2     6
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
                     Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
                     keepX = c(20,20)) ## using results from "nrepeat=100"
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN',
          size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
          size.legend = rel(1))

selectVar(res_splsda, comp = 1)$name
##  [1] "Leucobacter"                 "Corynebacterium confusum"   
##  [3] "Pseudochrobactrum"           "Gelidibacter"               
##  [5] "Streptococcus gallolyticus"  "Moraxella lacunata"         
##  [7] "Gracilibacteria"             "Ochrobactrum sp."           
##  [9] "Arthrobacter sp."            "Trueperella pyogenes"       
## [11] "Faecalicoccus pleomorphus"   "Phyllobacteriaceae"         
## [13] "Propionibacteriaceae"        "Helcococcus ovis"           
## [15] "Sporosarcina sp."            "Olivibacter"                
## [17] "Chryseobacterium sp."        "Delftia sp."                
## [19] "Mycoplasma bovoculi M165/69" "Lachnoclostridium"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
             size.title = rel(1.5),size.axis = rel(1.5),size.legend = rel(1))

selected <- selectVar(res_splsda, comp = 1)$name
p <- list()
for (ind in seq_along(selected[1:10])) {
  df <- data.frame(counts = abundances[selected[ind],sel_samples], 
                   condition = group_virus[sel_samples])
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw() + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2","darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  #print(p)
}
multiplot(plotlist=p,cols=2)

all_pvals <- sapply(seq_along(selected), function(ind) {
  all_data <- log(abundances[selected[ind],sel_samples] + 1)
  pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
                  FDR = p.adjust(all_pvals, method = "BH"))
res
##                       bacteria     pvalue        FDR
## 1                  Leucobacter 0.03481991 0.08704976
## 2     Corynebacterium confusum 0.03045623 0.08704976
## 3            Pseudochrobactrum 0.03477661 0.08704976
## 4                 Gelidibacter 0.01618017 0.08704976
## 5   Streptococcus gallolyticus 0.02026699 0.08704976
## 6           Moraxella lacunata 0.01911170 0.08704976
## 7              Gracilibacteria 0.02123296 0.08704976
## 8             Ochrobactrum sp. 0.07461385 0.12304604
## 9             Arthrobacter sp. 0.09038226 0.12304604
## 10        Trueperella pyogenes 0.04204057 0.09342350
## 11   Faecalicoccus pleomorphus 0.02917639 0.08704976
## 12          Phyllobacteriaceae 0.08753570 0.12304604
## 13        Propionibacteriaceae 0.15041696 0.16178188
## 14            Helcococcus ovis 0.13733066 0.16178188
## 15            Sporosarcina sp. 0.05414143 0.10828286
## 16                 Olivibacter 0.09228453 0.12304604
## 17        Chryseobacterium sp. 0.14258569 0.16178188
## 18                 Delftia sp. 0.16064938 0.16178188
## 19 Mycoplasma bovoculi M165/69 0.06573544 0.11951897
## 20           Lachnoclostridium 0.16178188 0.16178188

None of the bacteria are found differentially abundant between infected and non infected samples.

Moreover, among the selected species, those that are also found in the control samples are:

also_in_common <- intersect(selected, common_species)
also_in_common
## [1] "Leucobacter"               "Gelidibacter"             
## [3] "Ochrobactrum sp."          "Arthrobacter sp."         
## [5] "Trueperella pyogenes"      "Faecalicoccus pleomorphus"
## [7] "Helcococcus ovis"          "Sporosarcina sp."         
## [9] "Delftia sp."

For those species the comparison of the total count and relative abundance are provided in the following boxplots:

viruses <- c("no virus", "virus")[as.numeric(group_virus[sel_samples])]
sel_ctrl <- which(condition_ctrl == "EN")
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
  df <- data.frame(counts = c(abundances[also_in_common[ind],sel_samples],
                              abundances_ctrl[also_in_common[ind],sel_ctrl]), 
                   condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() + 
    scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  #print(p)
}
multiplot(plotlist = p,cols=2)

case_counts <- apply(abundances[ ,sel_samples], 2, sum)
ctrl_counts <- apply(abundances_ctrl[ ,sel_ctrl], 2, sum)
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
  freq_case <- abundances[also_in_common[ind],sel_samples] / case_counts
  freq_ctrl <- abundances_ctrl[also_in_common[ind],sel_ctrl] / ctrl_counts
  df <- data.frame(counts = c(freq_case, freq_ctrl), 
                   condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + 
    scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab("relative abundance")
  #print(p)
}
multiplot(plotlist = p,cols=2)

PLS-DA in EN condition (log-transformed counts, unscaled)

sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
                   Y = group_virus[sel_samples], 
                   ncomp = nlevels(group_virus),
                   scale = FALSE)
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
                 progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN')

set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
                         Y = group_virus[sel_samples], 
                         ncomp = nlevels(group_virus),
                         test.keepX = 1:20, validation = 'Mfold', 
                         folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE, scale = FALSE)
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##     6     2
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
                     Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
                     keepX = sel_keepX, scale = FALSE)
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN')

selectVar(res_splsda, comp = 1)$name
## [1] "Moraxella lacunata"       "Trueperella pyogenes"    
## [3] "Ureaplasma"               "Corynebacterium confusum"
## [5] "Gracilibacteria"          "Leptotrichiaceae"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
             size.title = 1)

selected <- selectVar(res_splsda, comp = 1)$name
for (ind in seq_along(selected)) {
  df <- data.frame(counts = abundances[selected[ind],sel_samples], 
                   condition = group_virus[sel_samples])
  p <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw() + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2","darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  print(p)
}

all_pvals <- sapply(seq_along(selected), function(ind) {
  all_data <- log(abundances[selected[ind],sel_samples] + 1)
  pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
                  FDR = p.adjust(all_pvals, method = "BH"))
res
##                   bacteria     pvalue        FDR
## 1       Moraxella lacunata 0.01911170 0.06091245
## 2     Trueperella pyogenes 0.04204057 0.06306086
## 3               Ureaplasma 0.13259862 0.14147549
## 4 Corynebacterium confusum 0.03045623 0.06091245
## 5          Gracilibacteria 0.02123296 0.06091245
## 6         Leptotrichiaceae 0.14147549 0.14147549

PLS-DA in LBA condition (log-transformed counts)

sel_samples <- which(condition == "LBA")
set.seed(11)
res_plsda <- plsda(X = log(t(abundances)+1)[sel_samples, ],
                   Y = group_virus[sel_samples], 
                   ncomp = nlevels(group_virus))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
                 progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',
          size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
          size.legend = rel(1))

set.seed(33)
res_plsda <- tune.splsda(X = log(t(abundances)+1)[sel_samples, ],
                         Y = group_virus[sel_samples], 
                         ncomp = nlevels(group_virus),
                         test.keepX = 1:20, validation = 'Mfold', 
                         folds = 5, dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE)
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##    15    14
res_splsda <- splsda(X = log(t(abundances)+1)[sel_samples, ],
                     Y = group_virus[sel_samples], ncomp = nlevels(group_virus),
                     keepX = c(20,20)) ## using results from "nrepeat=100"
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA (RSV, PI.3 and Coronavirus) - EN',
          size.title = rel(2),size.xlabel = rel(1.5),size.ylabel = rel(1.5),
          size.legend = rel(1))

selectVar(res_splsda, comp = 1)$name
##  [1] "Brachybacterium sp."            "Jeotgalicoccus psychrophilus"  
##  [3] "Psychrobacter pulmonis"         "Paracoccus alcaliphilus"       
##  [5] "Staphylococcus arlettae CVD059" "Ruminococcaceae UCG-005"       
##  [7] "Curtobacterium flaccumfaciens"  "Lachnospiraceae NK3A20 group"  
##  [9] "Jeotgalicoccus"                 "Delftia sp."                   
## [11] "Mycoplasma bovis"               "Sphingomonas sp."              
## [13] "Facklamia"                      "Comamonas"                     
## [15] "Trichococcus"                   "Helcococcus ovis"              
## [17] "Nocardiopsis"                   "Leptotrichiaceae"              
## [19] "Thauera"                        "Desemzia incerta"
plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',
             size.title = rel(1.5),size.axis = rel(1.5),size.legend = rel(1))

selected <- selectVar(res_splsda, comp = 1)$name
p <- list()
for (ind in seq_along(selected[1:10])) {
  df <- data.frame(counts = abundances[selected[ind],sel_samples], 
                   condition = group_virus[sel_samples])
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw() + scale_y_log10() + 
    scale_fill_manual(values = c("dodgerblue2","darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  #print(p)
}
multiplot(plotlist=p,cols=2)

all_pvals <- sapply(seq_along(selected), function(ind) {
  all_data <- log(abundances[selected[ind],sel_samples] + 1)
  pval <- t.test(all_data ~ group_virus[sel_samples])$p.value
})
res <- data.frame(bacteria = selected, pvalue = all_pvals,
                  FDR = p.adjust(all_pvals, method = "BH"))
res
##                          bacteria     pvalue       FDR
## 1             Brachybacterium sp. 0.05052448 0.1447919
## 2    Jeotgalicoccus psychrophilus 0.04775828 0.1447919
## 3          Psychrobacter pulmonis 0.04029216 0.1447919
## 4         Paracoccus alcaliphilus 0.06314502 0.1447919
## 5  Staphylococcus arlettae CVD059 0.09985810 0.1521428
## 6         Ruminococcaceae UCG-005 0.10649995 0.1521428
## 7   Curtobacterium flaccumfaciens 0.02126597 0.1447919
## 8    Lachnospiraceae NK3A20 group 0.06462858 0.1447919
## 9                  Jeotgalicoccus 0.08698581 0.1521428
## 10                    Delftia sp. 0.07341817 0.1468363
## 11               Mycoplasma bovis 0.13123811 0.1543978
## 12               Sphingomonas sp. 0.02971395 0.1447919
## 13                      Facklamia 0.10558693 0.1521428
## 14                      Comamonas 0.12787987 0.1543978
## 15                   Trichococcus 0.12570172 0.1543978
## 16               Helcococcus ovis 0.14864534 0.1588076
## 17                   Nocardiopsis 0.03295577 0.1447919
## 18               Leptotrichiaceae 0.06515637 0.1447919
## 19                        Thauera 0.18595844 0.1859584
## 20               Desemzia incerta 0.15086719 0.1588076

None of the bacteria are found differentially abundant between infected and non infected samples.

Moreover, among the selected species, those that are also found in the control samples are:

also_in_common <- intersect(selected, common_species)
also_in_common
##  [1] "Brachybacterium sp."          "Paracoccus alcaliphilus"     
##  [3] "Ruminococcaceae UCG-005"      "Lachnospiraceae NK3A20 group"
##  [5] "Jeotgalicoccus"               "Delftia sp."                 
##  [7] "Sphingomonas sp."             "Facklamia"                   
##  [9] "Comamonas"                    "Trichococcus"                
## [11] "Helcococcus ovis"             "Thauera"

For those species the comparison of the total count and relative abundance are provided in the following boxplots:

viruses <- c("no virus", "virus")[as.numeric(group_virus[sel_samples])]
sel_ctrl <- which(condition_ctrl == "EN")
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
  df <- data.frame(counts = c(abundances[also_in_common[ind],sel_samples],
                              abundances_ctrl[also_in_common[ind],sel_ctrl]), 
                   condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts + 1, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + scale_y_log10() + 
    scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab(expression(log[10] ~ "(count + 1)"))
  #print(p)
}
multiplot(plotlist = p,cols=2)

case_counts <- apply(abundances[ ,sel_samples], 2, sum)
ctrl_counts <- apply(abundances_ctrl[ ,sel_ctrl], 2, sum)
p <- list()
for (ind in seq_along(also_in_common[1:4])) {
  freq_case <- abundances[also_in_common[ind],sel_samples] / case_counts
  freq_ctrl <- abundances_ctrl[also_in_common[ind],sel_ctrl] / ctrl_counts
  df <- data.frame(counts = c(freq_case, freq_ctrl), 
                   condition = factor(c(viruses, rep("control", length(sel_ctrl)))))
  p[[ind]] <- ggplot(df, aes(x = condition, y = counts, fill = condition)) +
    geom_boxplot() + theme_bw(base_size = 15) + 
    scale_fill_manual(values = c("green", "dodgerblue2", "darkorange")) +
    ggtitle(selected[ind]) + ylab("relative abundance")
  #print(p)
}
multiplot(plotlist = p,cols=2)

New data sets with commun bacteria

data_commun_sick <- abundances[which(rownames(abundances)%in%rownames(abundances_ctrl)),]
data_commun_ctrl <- abundances_ctrl[which(rownames(abundances_ctrl)%in%rownames(abundances)),]
data_supp <- cbind.data.frame(data_commun_sick,data_commun_ctrl)

PCA

log_commun_sick<-log(t(data_commun_sick)+1)
log_commun_ctrl<-log(t(data_commun_ctrl)+1)
pca_raw <- pca(log_commun_sick, ncomp = ncol(data_commun_sick), 
               logratio = 'none',scale=TRUE)

Adding supplementary individuals

plotIndiv(pca_raw, 
          comp = c(1,2),
          pch = 16, 
          ind.names = FALSE, 
          group=condition,
          col.per.group = color.mixo(1:2),
          legend = TRUE,
          style="graphics",
          title="PCA for log-tranformed data",cex=1.5,size.xlabel = rel(1.5),size.legend = rel(1.5))
suppIndiv <- ((log_commun_ctrl-mean(log_commun_sick))/sd(log_commun_sick))%*%pca_raw$rotation[,1:2]
points(suppIndiv[,1],suppIndiv[,2],
       col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=1,cex=1.5)

indiv <- scale(log_commun_sick)%*%pca_raw$rotation[,1:2]
plot(indiv[,1],indiv[,2],col=ifelse(clean_condition=='EN',"dodgerblue2","darkorange"),pch=16,main="PCA")
points(suppIndiv[,1],suppIndiv[,2],
       col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=1)

sel_samples <- which(condition == "EN")
set.seed(11)
res_plsda <- plsda(X = log_commun_sick[sel_samples, ],
                   Y = group_virus[sel_samples], 
                   ncomp = nlevels(group_virus),
                   scale = TRUE)
res_perf <- perf(res_plsda, validation = 'Mfold', folds = 5,
                 progressBar = FALSE, nrepeat = 10)
plot(res_perf, overlay = 'measure', sd = TRUE)

plotIndiv(res_plsda , comp = c(1, 2), ind.names = FALSE, ellipse = FALSE, 
          legend = TRUE, 
          title = 'PLS-DA (RSV, PI.3 and Coronavirus) - EN',style="graphics")
suppIndiv <- ((log_commun_ctrl[which(condition_ctrl == "EN"),]
                  -mean(log_commun_sick[sel_samples,]))/sd(log_commun_sick[sel_samples,]))%*%res_plsda$loadings$X
points(suppIndiv[,1],suppIndiv[,2],pch=17)

indiv=scale(log_commun_sick[sel_samples,])%*%res_plsda$loadings$X
plot(indiv[,1],indiv[,2],col=ifelse(group_virus[sel_samples]=='0',"dodgerblue2","darkorange"),pch=16)
points(suppIndiv[,1],suppIndiv[,2],pch=17)

PLS-DA

set.seed(33)
res_plsda <- tune.splsda(log_commun_sick[id_abundances != "29",], clean_condition, 
                         ncomp = nlevels(clean_condition),
                         multilevel = clean_id,
                         test.keepX = 1:20, validation = 'Mfold', folds = 10, 
                         dist = 'mahalanobis.dist', nrepeat = 10,
                         progressBar = FALSE,scale=FALSE)   #ajout de scale=FALSE sinon résultats incohérents
## Splitting the variation for 1 level factor.
plot(res_plsda)

sel_keepX <- res_plsda$choice.keepX[1:2]
sel_keepX
## comp1 comp2 
##     7    11
res_splsda <- splsda(log_commun_sick[id_abundances != "29",], clean_condition, 
                     ncomp = nlevels(clean_condition), multilevel = clean_id,
                     keepX = sel_keepX, scale=FALSE)
## Splitting the variation for 1 level factor.
plot('n',xlim=c(-4,9),ylim=c(-5,5))
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduits lors de la
## conversion automatique

plotIndiv(res_splsda, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, 
          legend = TRUE, title = 'sPLS-DA Comp 1 - 2',style="graphics")
suppIndiv <- log_commun_ctrl%*%res_splsda$loadings$X
chose = (log_commun_sick[id_abundances!="29", ]) %*% res_splsda$loadings$X
points(suppIndiv[,1],suppIndiv[,2],col=ifelse(condition_ctrl=='EN',"dodgerblue2","darkorange"),pch=16,cex=1.5)

plotLoadings(res_splsda, comp = 1, method = 'mean', contrib = 'max',size.title = 1)

truc = cbind(data_commun_ctrl, data_commun_sick)
truc = data.frame(t(truc), cond = paste0(c(condition_ctrl,condition), c(rep("C", 12), rep("S", 45))))
boxplot(log(truc$Streptococcus.pluranimalium + 1) ~ truc$cond)

boxplot(log(truc$Pseudomonas.sp. + 1) ~ truc$cond)

boxplot(log(truc$Mannheimia.sp. + 1) ~ truc$cond)

sPLS with regression mode - EN condition

sel_samples <- which(condition == "EN")
set.seed(1205)
spls <- spls(X = log(t(abundances)+1)[sel_samples, ], 
             sapply(pathogenes[sel_samples, ], as.numeric) - 1, ncomp = 2, 
             mode = "regression", keepX = c(10,10), keepY = c(7,7))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(spls, comp = 1:2, rep.space= 'XY-variate', 
          ind.names = id_abundances[sel_samples], legend = F, 
          title = 'sPLS comp 1 - 2, XY-space',size.title = rel(2),
          size.ylabel = rel(1.5), size.xlabel = rel(1.5),cex=5)

plotVar(spls, comp = 1:2, var.names = list(X.label = TRUE, Y.label = TRUE), 
        cex = c(4, 6))

sPLS with regression mode - LBA condition

sel_samples <- which(condition == "LBA")
set.seed(1205)
spls <- spls(X = log(t(abundances)+1)[sel_samples, ], 
             sapply(pathogenes[sel_samples, ], as.numeric) - 1, ncomp = 2, 
             mode = "regression", keepX = c(10,10), keepY = c(7,7))
## Warning in cor(A[[k]], variates.A[[k]]): l'écart type est nulle
plotIndiv(spls, comp = 1:2, rep.space= 'XY-variate', 
          ind.names = id_abundances[sel_samples], legend = F, 
          title = 'sPLS comp 1 - 2, XY-space',size.title = rel(2),
          size.ylabel = rel(1.5), size.xlabel = rel(1.5),cex=5)

plotVar(spls, comp = 1:2, var.names = list(X.label = TRUE, Y.label = TRUE), 
        cex = c(4, 6))